I am a Senior Bioinformatician and Product Owner with over 5 years of experience bridging the gap between computational biology and pharmaceutical R&D. Currently at 7N, I lead the development of scalable R/Shiny applications and NGS workflows for a Big Pharma client.
I hold a Ph.D. in Biological Sciences and am currently pursuing an MBA in Healthcare Management, combining deep technical expertise with strategic business insight.
R & Shiny | Python | WDL & Nextflow | AWS & Cloud | Docker | SQL & Snowflake | NGS Analysis | Git | Agile / Scrum | Product Ownership
Click on a publication card below to reveal the specific Bioinformatics & Statistics methods used.
Below is an interactive gallery of 15 analytical modules demonstrating my coding expertise using realistic, simulated biological datasets. Click the tabs to explore.
Focus: Population structure, GWAS, Variant Quality Control, and Evolution.
Population structure visualization simulating realistic genotype noise.
# Simulation dimensions: samples are rows, SNPs are columns
n_samples <- 300
n_snps <- 1000
# Background: every genotype value is standard-normal noise
G <- matrix(rnorm(n_samples * n_snps), nrow = n_samples)
# Three populations, each defined by a block of rows whose values are
# shifted over a block of SNP columns:
#   rows   1-100, SNPs   1-100: +0.5
#   rows 101-200, SNPs  50-150: -0.6
#   rows 201-300, SNPs 800-900: +0.7
pop_shifts <- list(
  list(rows = 1:100, snps = 1:100, delta = 0.5),
  list(rows = 101:200, snps = 50:150, delta = -0.6),
  list(rows = 201:300, snps = 800:900, delta = 0.7)
)
for (shift in pop_shifts) {
  G[shift$rows, shift$snps] <- G[shift$rows, shift$snps] + shift$delta
}
# Breed label for each row, in the same order as the row blocks above
groups <- rep(c("Holstein", "Jersey", "Angus"), each = 100)
# PCA on the column-scaled matrix
pca_res <- prcomp(G, scale. = TRUE)
# Percentage of variance carried by the first two components
pca_importance <- summary(pca_res)$importance
var_explained <- round(
  pca_importance["Proportion of Variance", c("PC1", "PC2")] * 100,
  1
)
df_pca <- data.frame(
  PC1 = pca_res$x[, "PC1"],
  PC2 = pca_res$x[, "PC2"],
  Breed = groups
)
# One colour per breed, shared by the point fill and the ellipse outline
breed_colors <- c(
  Angus = "#E69F00",
  Holstein = "#56B4E9",
  Jersey = "#009E73"
)
ggplot(df_pca, aes(x = PC1, y = PC2, fill = Breed)) +
  geom_point(shape = 21, size = 3, alpha = 0.7, color = "white") +
  # 95% ellipse per breed, drawn as a faint filled polygon
  stat_ellipse(
    aes(color = Breed),
    geom = "polygon",
    level = 0.95,
    alpha = 0.1,
    show.legend = FALSE
  ) +
  scale_fill_manual(values = breed_colors) +
  scale_color_manual(values = breed_colors) +
  labs(
    title = "Population Structure (PCA)",
    subtitle = "Simulated genotype matrix (300 samples x 1k SNPs)",
    x = sprintf("PC1 (%s%% Var)", var_explained[1]),
    y = sprintf("PC2 (%s%% Var)", var_explained[2])
  ) +
  theme_minimal()
Genome-Wide Association Study with LD-like signal peaks.
# Simulated GWAS summary table: 5 chromosomes with 1000 SNPs each
n_snps <- 5000
gwas_df <- data.frame(
  SNP = paste0("rs", seq_len(n_snps)),
  Chr = rep(1:5, each = 1000),
  Pos = rep(1:1000, times = 5),
  # Null background: p-values drawn uniformly on (0, 1)
  PVal = runif(n_snps)
)
# Overwrite a window of `width` consecutive rows around row `center`
# (which falls on chromosome 2) with p-values ranging from 1e-1 down to
# 1e-8, each scaled by a random factor in [0.1, 1]
center <- 1500
width <- 50
peak_signal <- 10^(-seq(1, 8, length.out = width))
peak_rows <- seq(from = center - width / 2, length.out = width)
gwas_df$PVal[peak_rows] <- peak_signal * runif(width, 0.1, 1)
# Plotting scale
gwas_df[["logP"]] <- -log10(gwas_df[["PVal"]])
ggplot(gwas_df, aes(x = Pos, y = logP, color = as.factor(Chr))) +
  geom_point(size = 1, alpha = 0.6) +
  # Dashed reference line at p = 5e-8
  geom_hline(
    yintercept = -log10(5e-8),
    linetype = "dashed",
    color = "red",
    alpha = 0.5
  ) +
  # One panel per chromosome, strip labels moved below the panels
  facet_grid(. ~ Chr, scales = "free_x", space = "free_x", switch = "x") +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Manhattan Plot", y = "-Log10 P-Value", x = "Chromosome") +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )
Quality Control: Realistic skewed distribution of Read Depth.
# Simulate read depth for 5000 variants from a negative binomial
# distribution (mean 40, dispersion size 5), which gives a right-skewed
# count distribution
dp_values <- rnbinom(5000, size = 5, mu = 40)
df_qc <- data.frame(Depth = dp_values)
ggplot(df_qc, aes(x = Depth)) +
  # Histogram on the density scale so it can share an axis with the density
  # curve. after_stat(density) replaces the deprecated ..density.. notation.
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 2,
    fill = "#34495e",
    color = "white",
    alpha = 0.8
  ) +
  # Smoothed density overlay (bandwidth doubled via adjust = 2)
  geom_density(color = "#e74c3c", size = 1, adjust = 2) +
  scale_x_continuous(limits = c(0, 150)) +
  theme_light() +
  labs(
    title = "Variant Quality Control (DP)",
    subtitle = "Distribution of sequencing depth across variants",
    x = "Read Depth (DP)",
    y = "Density"
  ) +
  # Marker at the simulated mean depth (the mu passed to rnbinom above)
  geom_vline(xintercept = 40, linetype = "dashed", color = "orange") +
  annotate("text", x = 60, y = 0.02, label = "Mean Depth ~ 40x", color = "orange")
Hierarchical clustering of genetic distances.
# Random 10 x 10 feature matrix (standard-normal values), one row per
# species labelled Species_A ... Species_J
m <- matrix(
  rnorm(100),
  nrow = 10,
  dimnames = list(paste0("Species_", LETTERS[seq_len(10)]), NULL)
)
# Pairwise distances between rows (dist() defaults), then Ward clustering
dist_mat <- dist(m)
hc <- hclust(dist_mat, method = "ward.D2")
# Narrow margins for the base-graphics dendrogram
par(mar = rep(2, 4))
plot(
  hc,
  main = "Phylogenetic Tree Reconstruction",
  sub = "Ward's method clustering",
  xlab = "",
  hang = -1, # negative hang: all labels hang down from height 0
  col = "#2980b9",
  lwd = 2
)
# Box the three clusters obtained by cutting the tree at k = 3
rect.hclust(hc, k = 3, border = "red")
Focus: Expression profiling, Pathways, and Single-Cell.
Differential Expression: Realistic relationship between Fold Change and P-value.
n_genes <- 3000
# Log fold changes: 2500 genes centred on 0 (sd 0.5), plus 250 centred on
# +2 and 250 centred on -2 (sd 1)
logfc <- c(
  rnorm(2500, mean = 0, sd = 0.5),
  rnorm(250, mean = 2, sd = 1),
  rnorm(250, mean = -2, sd = 1)
)
# log10 p-value falls by 3 per unit of |logFC|, plus gaussian noise, so
# larger effects tend to get smaller p-values
noise <- rnorm(n_genes, mean = 0, sd = 2)
log_pval <- noise - 3 * abs(logfc)
# Back-transform and cap at 1 so every value is a valid p-value
pvals <- pmin(10^log_pval, 1)
df_vol <- data.frame(
  Gene = paste0("G", seq_len(n_genes)),
  log2FoldChange = logfc,
  padj = p.adjust(pvals, method = "BH")
)
# Classify each gene: significant (padj < 0.05) with |log2FC| > 1, else NS
is_sig <- df_vol$padj < 0.05
df_vol$Group <- ifelse(
  is_sig & df_vol$log2FoldChange > 1,
  "Up-regulated",
  ifelse(is_sig & df_vol$log2FoldChange < -1, "Down-regulated", "NS")
)
ggplot(df_vol, aes(x = log2FoldChange, y = -log10(padj), color = Group)) +
  geom_point(size = 1.5, alpha = 0.5) +
  # Dashed guides at the significance and fold-change cut-offs used above
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", alpha = 0.5) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", alpha = 0.5) +
  scale_color_manual(values = c("steelblue", "grey85", "firebrick")) +
  labs(
    title = "Differential Expression (Volcano Plot)",
    subtitle = "Simulated DESeq2 output with FDR correction",
    x = "Log2 Fold Change",
    y = "-Log10 Adjusted P-value"
  ) +
  theme_minimal()
Simulating cellular trajectories/clusters.
# Three noisy rings instead of gaussian blobs: the same unit circle is
# reused for each cell type and moved to a different centre
t <- seq(0, 2 * pi, length.out = 300)
# Ring centres are (0, 0), (2.5, 1) and (1.2, -2); gaussian jitter (sd 0.2)
# is added to every coordinate
x <- rep(cos(t), times = 3) + rep(c(0, 2.5, 1.2), each = 300) +
  rnorm(900, 0, 0.2)
y <- rep(sin(t), times = 3) + rep(c(0, 1, -2), each = 300) +
  rnorm(900, 0, 0.2)
# Cell-type label for each ring, in the same order as the centres
clusters <- rep(c("Stem", "Progenitor", "Differentiated"), each = 300)
df_umap <- data.frame(UMAP1 = x, UMAP2 = y, CellType = clusters)
ggplot(df_umap, aes(x = UMAP1, y = UMAP2, color = CellType)) +
  geom_point(alpha = 0.6, size = 0.8) +
  scale_color_manual(values = c("#9b59b6", "#3498db", "#e74c3c")) +
  labs(
    title = "Single-Cell UMAP Projection",
    subtitle = "Developmental Trajectory Simulation"
  ) +
  # No axes or grid; only the legend is kept
  theme_void() +
  theme(legend.position = "right")
Dotplot for GSEA results.
# Hand-written enrichment table: one row per pathway
df_path <- data.frame(
  Pathway = c(
    "Cell Cycle", "DNA Replication", "P53 Signaling",
    "Apoptosis", "MAPK Signaling", "Ribosome"
  ),
  Count = c(45, 30, 25, 20, 15, 10),
  GeneRatio = c(0.15, 0.12, 0.09, 0.08, 0.05, 0.03),
  p.adjust = c(1e-9, 1e-6, 0.001, 0.01, 0.03, 0.045)
)
# Order the pathway factor by ascending gene ratio so the y axis is sorted
ratio_rank <- order(df_path$GeneRatio)
df_path$Pathway <- factor(
  df_path$Pathway,
  levels = df_path$Pathway[ratio_rank]
)
# Dot size encodes gene count; colour encodes adjusted p-value
# (red = smallest, blue = largest)
ggplot(
  df_path,
  aes(x = GeneRatio, y = Pathway, size = Count, color = p.adjust)
) +
  geom_point() +
  scale_color_gradient(low = "red", high = "blue") +
  labs(title = "KEGG Pathway Enrichment", x = "Gene Ratio", y = "") +
  theme_light()
Visualizing co-expression modules.
# Simulated expression matrix: 20 genes (rows) x 20 samples (columns) of
# standard-normal noise
mat <- matrix(rnorm(400), nrow = 20, ncol = 20)
# Two co-expression modules: genes 1-10 are raised in samples 1-10 and
# genes 11-20 are lowered in samples 11-20
mat[1:10, 1:10] <- mat[1:10, 1:10] + 2
mat[11:20, 11:20] <- mat[11:20, 11:20] - 2
sample_ids <- paste0("S", 1:20)
gene_ids <- paste0("G", 1:20)
df_h <- as.data.frame(mat)
colnames(df_h) <- sample_ids
df_h$Gene <- gene_ids
# Long format: one row per gene/sample pair
df_h_long <- pivot_longer(df_h, cols = -Gene, names_to = "Sample", values_to = "Exp")
# Fix the axis order explicitly. As plain character columns the labels are
# sorted alphabetically (S1, S10, S11, ..., S2, S20, ...), which interleaves
# the two modules and hides the block structure built above.
df_h_long$Sample <- factor(df_h_long$Sample, levels = sample_ids)
df_h_long$Gene <- factor(df_h_long$Gene, levels = gene_ids)
ggplot(df_h_long, aes(Sample, Gene, fill = Exp)) +
  geom_tile() +
  # Diverging palette centred on the default midpoint of 0
  scale_fill_gradient2(low = "#2c7bb6", mid = "#ffffbf", high = "#d7191c") +
  theme_minimal() +
  theme(axis.text.x = element_blank()) +
  labs(title = "Gene Expression Heatmap", x = "Samples (n=20)", y = "Genes (n=20)")
Focus: Survival, Longitudinal Models, and Correlations.
Time-to-event analysis with realistic censoring.
# Patients per arm
n <- 100
# True event times are Weibull (shape 1.5); the placebo arm has the smaller
# scale, i.e. shorter times on average
T_treat <- rweibull(n, shape = 1.5, scale = 100)
T_placebo <- rweibull(n, shape = 1.5, scale = 60)
df_surv <- data.frame(
  Time = c(T_treat, T_placebo),
  Group = rep(c("Treatment", "Placebo"), each = n)
)
# Independent uniform censoring time per patient on [0, 150]
cens_time <- runif(2 * n, min = 0, max = 150)
# The event is observed only when it happens before the censoring time;
# otherwise follow-up ends at the censoring time
is_event <- df_surv$Time < cens_time
df_surv$Status <- as.numeric(is_event) # 1 = event, 0 = censored
df_surv$TimeObs <- ifelse(is_event, df_surv$Time, cens_time)
# Kaplan-Meier estimate per arm
fit <- survfit(Surv(TimeObs, Status) ~ Group, data = df_surv)
# Curves and legend use the same colour vector, with legend labels in
# sorted group order
km_colors <- c("red", "blue")
plot(
  fit,
  main = "Kaplan-Meier Analysis (Simulated Clinical Trial)",
  xlab = "Time (Months)",
  ylab = "Survival Probability",
  col = km_colors,
  lwd = 3,
  frame.plot = FALSE
)
legend(
  "topright",
  legend = sort(unique(df_surv$Group)),
  col = km_colors,
  lwd = 3,
  bty = "n"
)
grid(col = "grey90")
Longitudinal data with realistic subject variance.
subjects <- 20
times <- 0:5
# Subject-level random effects: baseline value and rate of change
intercepts <- rnorm(subjects, 10, 2)
slopes <- rnorm(subjects, 0.5, 0.2)
# Build the long table in one vectorised step instead of growing it with
# rbind() inside a loop. Rows are subject-major (all time points of S1, then
# S2, ...), and the residuals are drawn in that same order.
n_times <- length(times)
subject_idx <- rep(seq_len(subjects), each = n_times)
time_rep <- rep(times, times = subjects)
# The first half of the subjects form group A, the second half group B
grp <- ifelse(subject_idx > subjects / 2, "B", "A")
# Group effect: group B gets an extra +1.5 on its slope
slope_eff <- slopes[subject_idx] + ifelse(grp == "B", 1.5, 0)
df_lmm <- data.frame(
  ID = paste0("S", subject_idx),
  Time = time_rep,
  Value = intercepts[subject_idx] + slope_eff * time_rep +
    rnorm(subjects * n_times, 0, 1),
  Group = grp
)
ggplot(df_lmm, aes(x = Time, y = Value, group = ID, color = Group)) +
  # Thin line per subject
  geom_line(alpha = 0.4) +
  # Thick line per group: mean of Value at each time point
  stat_summary(aes(group = Group), fun = mean, geom = "line", size = 2) +
  scale_color_manual(values = c("#95a5a6", "#e74c3c")) +
  theme_bw() +
  labs(title = "Longitudinal Response (LMM)", subtitle = "Group A vs B treatment trajectories")
Clinical variable relationships.
# Target covariance (unit variances, so also the target correlation) for
# three variables, written row by row
sigma <- rbind(
  c(1, 0.8, -0.5),
  c(0.8, 1, -0.3),
  c(-0.5, -0.3, 1)
)
# 50 draws from a zero-mean multivariate normal with that covariance
data_corr <- mvrnorm(n = 50, mu = rep(0, 3), Sigma = sigma)
dimnames(data_corr) <- list(NULL, c("BMI", "Insulin", "Activity"))
# Sample correlation matrix, rounded for display
cormat <- round(cor(data_corr), digits = 2)
# Long format: as.table() yields columns Var1, Var2 and Freq (the value)
melted <- as.data.frame(as.table(cormat))
ggplot(melted, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile(color = "white") +
  # Print the coefficient inside each tile
  geom_text(aes(label = Freq)) +
  scale_fill_gradient2(low = "#2980b9", mid = "white", high = "#c0392b") +
  labs(title = "Clinical Correlations", x = "", y = "") +
  theme_minimal()
Focus: Community Diversity and Composition.
Comparing species richness (Non-normal distributions).
# Gamma-distributed diversity values, 40 per group. Both groups share the
# scale (0.2); the first has the larger shape and therefore the larger mean.
grp_a <- rgamma(40, shape = 20, scale = 0.2)
grp_b <- rgamma(40, shape = 15, scale = 0.2)
df_alpha <- data.frame(
  Index = c(grp_a, grp_b),
  Group = rep(
    c("Healthy", "Dysbiosis"),
    times = c(length(grp_a), length(grp_b))
  )
)
# Fill colour per group
group_fill <- c(Dysbiosis = "#e74c3c", Healthy = "#2ecc71")
ggplot(df_alpha, aes(x = Group, y = Index, fill = Group)) +
  # Untrimmed violin for the distribution shape, narrow boxplot on top
  geom_violin(trim = FALSE, alpha = 0.3) +
  geom_boxplot(width = 0.2, alpha = 0.8) +
  scale_fill_manual(values = group_fill) +
  labs(title = "Microbiome Alpha Diversity (Shannon)", y = "Diversity Index") +
  theme_classic()
Visualizing community separation.
# Two bivariate normal clouds of 25 points each: one centred at (2, 2) with
# positive covariance, one centred at (-1, -1) with negative covariance
g1 <- mvrnorm(25, mu = c(2, 2), Sigma = matrix(c(1, 0.5, 0.5, 1), 2))
g2 <- mvrnorm(25, mu = c(-1, -1), Sigma = matrix(c(1, -0.2, -0.2, 1), 2))
# Stack the coordinates and label each cloud with its group
ordination <- rbind(g1, g2)
df_beta <- data.frame(
  PCoA1 = ordination[, 1],
  PCoA2 = ordination[, 2],
  Grp = rep(c("Ctrl", "Treat"), each = 25)
)
ggplot(df_beta, aes(x = PCoA1, y = PCoA2, color = Grp)) +
  geom_point(size = 3) +
  # Ellipse per group with stat_ellipse() defaults
  stat_ellipse() +
  labs(
    title = "Beta Diversity (Bray-Curtis PCoA)",
    subtitle = "Clustering of microbial communities"
  ) +
  theme_minimal()
Focus: Drug Discovery and Machine Learning.
Pharmacodynamics using 4-Parameter Logistic Model.
# Four-parameter log-logistic dose-response function.
#   x: concentration(s), must be > 0 because log(x) is taken
#   b: slope; with d > c, b > 0 gives a response that falls from d to c as x
#      grows, and b < 0 gives a response that rises from c to d
#   c: bottom asymptote
#   d: top asymptote
#   e: concentration at which the response is halfway between c and d
# Returns the response, vectorised over x.
f_4pl <- function(x, b, c, d, e) {
  c + (d - c) / (1 + exp(b * (log(x) - log(e))))
}
concs <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000)
# Parameters: b=Slope, c=Bottom, d=Top, e=IC50
# The slope must be positive here: the plot shows cell viability under an
# inhibitor, so the response has to fall from Top (100) at low dose to
# Bottom (5) at high dose. The previous b = -1.5 made viability increase
# with concentration.
true_y <- f_4pl(concs, b = 1.5, c = 5, d = 100, e = 50)
# Three replicates per concentration with gaussian noise (sd 5)
df_dr <- data.frame(
  Conc = rep(concs, each = 3)
)
df_dr$Resp <- rep(true_y, each = 3) + rnorm(nrow(df_dr), 0, 5)
ggplot(df_dr, aes(x = Conc, y = Resp)) +
  geom_point(size = 2.5, alpha = 0.6, color = "#8e44ad") +
  # Generating curve, drawn with the same parameters used to simulate true_y
  geom_function(fun = function(x) f_4pl(x, 1.5, 5, 100, 50), color = "#2c3e50", size = 1) +
  scale_x_log10() +
  theme_bw() +
  labs(
    title = "Drug Dose-Response (Sigmoidal)",
    subtitle = "IC50 = 50 nM (4-Parameter Logistic Fit)",
    x = "Concentration (nM) [Log]",
    y = "Cell Viability (%)"
  ) +
  # Vertical marker at the IC50 (parameter e)
  geom_vline(xintercept = 50, linetype = "dashed", color = "red")
Model performance metrics.
# Simulated classifier scores: 500 negatives centred on 0.3 and 500
# positives centred on 0.7 (both sd 0.15)
neg_scores <- rnorm(500, mean = 0.3, sd = 0.15)
pos_scores <- rnorm(500, mean = 0.7, sd = 0.15)
# Decision thresholds. The scores are unbounded normals, so -Inf and Inf are
# added to make the curve start at (1, 1) and end at (0, 0); a 0..1 grid
# alone stops short of both corners.
thresholds <- c(-Inf, seq(0, 1, 0.01), Inf)
# Rate of positives / negatives scoring above each threshold
# (vapply guarantees a numeric vector of the same length as thresholds)
tpr <- vapply(thresholds, function(t) mean(pos_scores > t), numeric(1))
fpr <- vapply(thresholds, function(t) mean(neg_scores > t), numeric(1))
df_roc <- data.frame(FPR = fpr, TPR = tpr)
# Area under the plotted curve by the trapezoid rule. FPR decreases as the
# threshold rises, hence the sign flip on diff(). The label is computed from
# the data instead of being hard-coded.
auc <- sum(-diff(fpr) * (tpr[-1] + tpr[-length(tpr)]) / 2)
auc_label <- sprintf("AUC = %.2f", auc)
ggplot(df_roc, aes(x = FPR, y = TPR)) +
  geom_path(color = "#27ae60", size = 1.5) +
  # Diagonal reference line for a random classifier
  geom_abline(linetype = "dashed", color = "grey") +
  theme_light() +
  annotate("text", x = 0.75, y = 0.25, label = auc_label, size = 6, color = "#27ae60") +
  labs(title = "ROC Analysis: SNP Classifier", x = "1 - Specificity (FPR)", y = "Sensitivity (TPR)")